import re
from glob import glob
from pathlib import Path

import nltk
import pandas as pd
import spacy
from bertopic import BERTopic
from bertopic.backend import BaseEmbedder
from nltk.corpus import stopwords
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer
from tqdm import tqdm
from umap import UMAP
def preprocess_text(text):
    """Strip punctuation, then lemmatize and keep only alphabetic, non-stop tokens.

    Returns a single space-joined string of lemmas.
    """
    text = re.sub(r'[^\w\s]', '', text)
    # Load the spaCy pipeline once and cache it on the function object:
    # the original re-ran spacy.load('en_core_web_sm') on every call,
    # which dominated runtime when mapped over hundreds of documents.
    nlp = getattr(preprocess_text, '_nlp', None)
    if nlp is None:
        nlp = preprocess_text._nlp = spacy.load('en_core_web_sm')
    doc = nlp(text)
    tokens = [token.lemma_ for token in doc if not token.is_stop and token.is_alpha]
    return ' '.join(tokens)
def split_text(text, max_length=100):
    """Chunk *text* into pieces of at most *max_length* whitespace-separated words.

    Returns a list of space-joined chunks; an empty/whitespace-only input
    yields an empty list.
    """
    words = text.split()
    chunks = []
    for start in range(0, len(words), max_length):
        chunks.append(' '.join(words[start:start + max_length]))
    return chunks
path = r"C:\Users\Cezary\Documents\Monita-privata\data\konferencja-poznan\txt/"
txt_files = glob(f"{path}*", recursive=True)
txt_dict = {}
for txt_file in tqdm(txt_files):
    # Path(...).name handles both '/' and '\' separators; the original
    # split on '\\' only, which breaks for forward-slash paths.
    # Keep the original key rule: filename up to the first dot.
    text_key = Path(txt_file).name.split('.', 1)[0]
    with open(txt_file, 'rt', encoding='utf-8') as f:
        txt_dict[text_key] = f.read()
texts = list(txt_dict.values())
# Lemmatize/clean each document, then re-chunk into <=100-word pieces so
# the embedding model sees passages of bounded length.
processed_texts = [preprocess_text(text) for text in tqdm(texts)]
split_texts = []
for text in tqdm(processed_texts):
    split_texts.extend(split_text(text))
# Drop chunks that became empty after preprocessing.
split_texts = [text for text in split_texts if text.strip() != '']
if len(split_texts) < 2:
    raise ValueError("Niewystarczająca liczba tekstów po przetwarzaniu wstępnym. Dodaj więcej danych wejściowych.")
# Pasted console output (tqdm progress) — commented out so the file parses:
# 100%|███████████████████████████████████████████████████████████████████████████████| 322/322 [00:02<00:00, 156.55it/s] 100%|████████████████████████████████████████████████████████████████████████████████| 322/322 [04:32<00:00, 1.18it/s] 100%|██████████████████████████████████████████████████████████████████████████████| 322/322 [00:00<00:00, 9714.48it/s]
# Lightweight English sentence-embedding model; the Polish HerBERT
# alternative is kept below for reference.
# sentence_model = SentenceTransformer("allegro/herbert-base-cased")
sentence_model = SentenceTransformer('paraphrase-MiniLM-L6-v2')
# NLTK English stop words, materialized as a list for CountVectorizer.
stop_words = list(stopwords.words('english'))
class EnglishEmbedder(BaseEmbedder):
    """BERTopic embedding backend wrapping a SentenceTransformer-style model.

    The wrapped model must expose ``.encode(documents, show_progress_bar=...)``.
    """

    def __init__(self, embedding_model):
        # Fix: the original skipped super().__init__(), leaving any state
        # set up by BaseEmbedder.__init__ uninitialized.
        super().__init__()
        self.embedding_model = embedding_model

    def embed(self, documents, verbose=False):
        # Delegates straight to the wrapped model; presumably returns an
        # (n_documents, dim) array — depends on the model's encode().
        return self.embedding_model.encode(documents, show_progress_bar=verbose)
english_embedder = EnglishEmbedder(sentence_model)

# Bag-of-words model used for topic keyword extraction (uni- and bigrams).
vectorizer_model = CountVectorizer(stop_words=stop_words, ngram_range=(1, 2))

# Dimensionality reduction applied to the embeddings before clustering.
umap_model = UMAP(
    n_neighbors=10,
    n_components=5,
    min_dist=0.1,
    metric='cosine',
)

topic_model = BERTopic(
    embedding_model=english_embedder,
    vectorizer_model=vectorizer_model,
    umap_model=umap_model,
    top_n_words=10,
    n_gram_range=(1, 2),
    min_topic_size=10,
    calculate_probabilities=True,
)
try:
    topics, probabilities = topic_model.fit_transform(split_texts)
except ValueError as e:
    # Report the failure compactly; the original dumped the entire corpus
    # to the console, which floods output for any realistically sized run.
    print(f"Error during model fitting: {e}")
    print(f"Number of texts: {len(split_texts)}; first few: {split_texts[:3]}")
    raise

# Compute the topic summary once (the original called get_topic_info twice).
topic_info = topic_model.get_topic_info()
print(topic_info)
topic_info.to_excel('jojs_topics_info.xlsx', index=False)
# Pasted console output (printed get_topic_info DataFrame) — commented out
# so the file parses:
# Topic Count Name
# 0 -1 1996 -1_jesuit_de_study_book \
# 1 0 262 0_environmental_access june_june_social
# 2 1 156 1_chinese_china_ricci_jesuit
# 3 2 99 2_japanese_japan_buddhism_kirishitanban
# 4 3 99 3_music_musical_fisher_song
# .. ... ... ...
# 77 76 10 76_canada_regis_regis college_college
# 78 77 10 77_omalley_history_john_philosophy
# 79 78 10 78_da_et_de_gama
# 80 79 10 79_object_chapter_image_painting
# 81 80 10 80_hopkinss_fancy_poetic_poet
# Representation
# 0 [jesuit, de, study, book, jesuits, journal, je... \
# 1 [environmental, access june, june, social, sj,...
# 2 [chinese, china, ricci, jesuit, matteo, riccis...
# 3 [japanese, japan, buddhism, kirishitanban, xav...
# 4 [music, musical, fisher, song, munich, jesuit,...
# .. ...
# 77 [canada, regis, regis college, college, englis...
# 78 [omalley, history, john, philosophy, john omal...
# 79 [da, et, de, gama, fernão guerreiro, guerreiro...
# 80 [object, chapter, image, painting, artist, wor...
# 81 [hopkinss, fancy, poetic, poet, read, poem, wo...
# Representative_Docs
# 0 [Open Access Takao open access article distrib...
# 1 [journal jesuit study UNbrillcomjjs Ignacimuth...
# 2 [Publishers r Pochia Hsia Jesuit Forbidden Cit...
# 3 [contribute study globalization Jesuit intelle...
# 4 [condition Second scene Cardinal Altamirano wa...
# .. ...
# 77 [Book Reviews journal jesuit study UN Joseph B...
# 78 [John W OMalley begin volume contribution enti...
# 79 [gogica Societatis Iesu ed Ladislaus Lukács vo...
# 80 [Lutheranism devotional image Erzgebirge Luthe...
# 81 [Catholic literal truth Lords word learn fragm...
# [82 rows x 5 columns]
# The visualize_* methods return Plotly figures; in a plain script the bare
# calls of the original discarded them silently. Persist each to HTML so the
# plots can actually be inspected.
for fig_name, make_fig in [
    ('topics', topic_model.visualize_topics),
    ('hierarchy', topic_model.visualize_hierarchy),
    ('barchart', topic_model.visualize_barchart),
    ('heatmap', topic_model.visualize_heatmap),
]:
    fig = make_fig()
    fig.write_html(f'jojs_{fig_name}.html')